#! /usr/bin/env python
# -*- coding: utf-8 -*-

##	  Copyright 2012, Geoffrey GROFF and Dimitri SEGARD
##	  This file is part of otaku-loader.
##
##    otaku-loader is free software: you can redistribute it and/or modify
##    it under the terms of the GNU General Public License as published by
##    the Free Software Foundation, either version 3 of the License, or
##    (at your option) any later version.
##
##    otaku-loader is distributed in the hope that it will be useful,
##    but WITHOUT ANY WARRANTY; without even the implied warranty of
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##    GNU General Public License for more details.
##
##    You should have received a copy of the GNU General Public License
##    along with otaku-loader.  If not, see <http://www.gnu.org/licenses/>.

import re

# Precompiled patterns used by the text-cleaning helpers in this module.

# A markup tag: '<', the shortest possible run of characters, then '>'.
# DOTALL lets '.' cross line breaks, so a tag whose attributes are wrapped
# over several lines is still matched as a single tag.
p_tag = re.compile(r'<.*?>', re.DOTALL)

# One or more consecutive whitespace characters.
p_spaces = re.compile(r'\s+')

# A single newline, tab or carriage return. Each character is matched on its
# own, so a "\r\n" pair counts as two matches (the former "\r\n" alternative
# could never win against the earlier "\r" one and has been dropped).
p_clean = re.compile(r'[\n\t\r]')

# An entity-like token: '&', then letters/digits/'#', then ';'
# (e.g. "&amp;" or "&#233;"), matched case-insensitively.
p_amper = re.compile(r'&[a-z0-9#]+;', re.IGNORECASE)

def strip_tags(data):
	"""Return `data` with every match of the `p_tag` pattern removed."""
	without_tags = re.sub(p_tag, '', data)
	return without_tags

def remove_extra_spaces(data):
	"""Return `data` with each match of `p_spaces` replaced by one space."""
	single_spaced = re.sub(p_spaces, ' ', data)
	return single_spaced

def clean(str):
	"""Normalize a raw text fragment and return the result.

	Steps, in order:
	  1. strip leading/trailing whitespace from the input;
	  2. replace every match of `p_clean` with a space;
	  3. replace every match of `p_amper` with a space;
	  4. replace chr(0x92) with "'" and chr(0x9c) with "oe";
	  5. turn backslash-escaped apostrophes (\\') into plain apostrophes.
	"""
	text = p_clean.sub(' ', str.strip())
	text = p_amper.sub(' ', text)
	# NOTE(review): 0x92 / 0x9c look like the Windows-1252 codes for the
	# curly apostrophe and the "oe" ligature -- confirm against the input data.
	for code, replacement in ((0x92, "'"), (0x9c, "oe")):
		text = text.replace(chr(code), replacement)
	return text.replace("\\'", "'")

def full_clean(str):
	"""Run the whole cleaning pipeline on `str` and return the result.

	The input goes through `clean`, then `strip_tags`, then
	`remove_extra_spaces`. Whitespace is collapsed last on purpose: removing
	a tag that sits between two spaces (e.g. "a <br> b") leaves a run of
	spaces behind, which would survive if the collapse ran before the
	tag removal.
	"""
	cleaned = clean(str)
	without_tags = strip_tags(cleaned)
	return remove_extra_spaces(without_tags)
